library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(gvlma)
library(car)
## Loading required package: carData
We are going to look at property prices for Orange County and then try to predict the sales price using linear regression.
Bring in the data and make sure the data types are correct. If not, make the proper changes. The file is located within this project at data/prop_prices_reduced.csv.
# Load the property-price data. file.path() assembles the relative path with a
# portable separator; the previous hard-coded "\\" only resolved on Windows.
prices <- read.csv(file.path("data", "prop_prices_reduced.csv"))
# Show the dimensions and the type of every column that was read in.
str(prices)
## 'data.frame': 1000 obs. of 8 variables:
## $ sale_def : int 88142 78046 273777 229185 464029 109152 190090 259402 139069 204245 ...
## $ bed : int 4 2 4 4 4 3 1 3 3 4 ...
## $ bath : num 2 2 2 3 3 2 1 2 2.5 2 ...
## $ area_heated: int 1270 1037 2821 2341 2981 1307 960 2010 1873 1701 ...
## $ area : int 3631 1307 10309 9578 12287 10088 269496 17270 9858 5658 ...
## $ dist_cbd : num 5212 14707 20898 7057 7772 ...
## $ dist_lakes : num 639 199 3799 193 226 ...
## $ pool : int 0 0 0 0 1 1 0 0 0 0 ...
Plot histograms for all variables. Additionally, add scatterplots for the relationships between all quantitative variables.
# Draw one histogram per column of `prices`, in column order. `main` and
# `xlab` are spelled out so every plot carries the same "prices$<column>"
# labels that a direct hist(prices$<column>) call generates by default.
invisible(lapply(
  c("sale_def", "bed", "bath", "area_heated", "area",
    "dist_cbd", "dist_lakes", "pool"),
  function(column) {
    column_label <- paste0("prices$", column)
    hist(prices[[column]],
         main = paste("Histogram of", column_label),
         xlab = column_label)
  }
))
# Scatterplots of pairwise relationships between the quantitative variables.
# `mode = "markers"` is passed explicitly: when it is omitted plotly has to
# pick the mode itself and prints a "No scatter mode" message for every plot.

# Sale price against each candidate predictor.
plot_ly(prices, y = ~sale_def, x = ~bed, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~area_heated, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~area, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~bath, type = "scatter", mode = "markers")

# Relationships among the predictors themselves.
plot_ly(prices, y = ~dist_lakes, x = ~area, type = "scatter", mode = "markers")
plot_ly(prices, y = ~area_heated, x = ~bath, type = "scatter", mode = "markers")
plot_ly(prices, y = ~area_heated, x = ~dist_lakes, type = "scatter", mode = "markers")

# Sale price against the two distance measures.
plot_ly(prices, y = ~sale_def, x = ~dist_lakes, type = "scatter", mode = "markers")
plot_ly(prices, y = ~sale_def, x = ~dist_cbd, type = "scatter", mode = "markers")
Provide basic summary statistics for univariate analysis. Also, provide the correlation between all the quantitative variables.
# Univariate summary statistics for the sale_def column of `prices`.
# NOTE(review): `data` is also the name of a function in R's utils package;
# a more specific name would avoid the clash -- confirm nothing else relies
# on this variable before renaming.
data <- prices$sale_def
# Arithmetic mean of sale_def.
mean(data)
## [1] 200319.3
# Median; it is lower than the mean printed above.
median(data)
## [1] 151085
# Largest observed value.
max(data)
## [1] 7629992
# Smallest observed value.
min(data)
## [1] 15296
# What about the summary stats for the rest of the variables?
# What about sd/var?
# Collect the seven quantitative columns (pool is left out) and prefix each
# name with "prices." so the correlation matrix is labelled
# prices.sale_def, prices.bed, and so on.
quant_var <- prices[c("sale_def", "bed", "bath", "area_heated", "area",
                      "dist_cbd", "dist_lakes")]
names(quant_var) <- paste0("prices.", names(quant_var))
# Pairwise correlations between all of the selected columns.
cor(quant_var)
## prices.sale_def prices.bed prices.bath prices.area_heated
## prices.sale_def 1.00000000 0.32557109 0.58731596 0.69201080
## prices.bed 0.32557109 1.00000000 0.64449531 0.66911599
## prices.bath 0.58731596 0.64449531 1.00000000 0.83255359
## prices.area_heated 0.69201080 0.66911599 0.83255359 1.00000000
## prices.area 0.34354392 0.10437035 0.19298319 0.30184325
## prices.dist_cbd 0.05263700 0.23328277 0.24220637 0.25494517
## prices.dist_lakes -0.08857844 0.03747888 -0.02109154 -0.04092947
## prices.area prices.dist_cbd prices.dist_lakes
## prices.sale_def 0.34354392 0.05263700 -0.08857844
## prices.bed 0.10437035 0.23328277 0.03747888
## prices.bath 0.19298319 0.24220637 -0.02109154
## prices.area_heated 0.30184325 0.25494517 -0.04092947
## prices.area 1.00000000 0.08648327 -0.15432291
## prices.dist_cbd 0.08648327 1.00000000 0.26520451
## prices.dist_lakes -0.15432291 0.26520451 1.00000000
Run a regression with all the variables included. Print results of the regression.
# Regress the sale price on all seven remaining columns of `prices`.
model <- lm(
  formula = sale_def ~ bed + bath + area_heated + area + dist_cbd +
    dist_lakes + pool,
  data = prices
)
# Print the coefficient table, residual summary, and fit statistics.
summary(model)
##
## Call:
## lm(formula = sale_def ~ bed + bath + area_heated + area + dist_cbd +
## dist_lakes + pool, data = prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -765238 -55999 9774 63848 4954827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.890e+04 3.093e+04 -1.904 0.057170 .
## bed -8.492e+04 1.087e+04 -7.810 1.45e-14 ***
## bath 5.449e+04 1.507e+04 3.616 0.000315 ***
## area_heated 2.439e+02 1.430e+01 17.054 < 2e-16 ***
## area 3.548e+00 5.958e-01 5.955 3.61e-09 ***
## dist_cbd -5.848e+00 1.090e+00 -5.366 1.00e-07 ***
## dist_lakes 9.651e-01 3.810e+00 0.253 0.800091
## pool -2.776e+04 1.640e+04 -1.693 0.090787 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194900 on 992 degrees of freedom
## Multiple R-squared: 0.547, Adjusted R-squared: 0.5438
## F-statistic: 171.2 on 7 and 992 DF, p-value: < 2.2e-16
Which of the variables tested significant at the 95% confidence level? Looking at the results and answering outside of the chunk is sufficient. Answer: bed, bath, area_heated, area, and dist_cbd (each has a p-value below 0.05).
As is, are any of the Gauss-Markov assumptions violated? If so, which ones? How can you fix the issues?
# Run gvlma's global validation of the linear-model assumptions on `model`.
gvmodel <- gvlma(x = model)
# The report repeats the lm summary and then lists the global statistic plus
# the skewness, kurtosis, link-function, and heteroscedasticity checks, each
# with a decision at the 0.05 significance level. In the output recorded
# below, every one of them reads "Assumptions NOT satisfied!".
summary(object = gvmodel)
##
## Call:
## lm(formula = sale_def ~ bed + bath + area_heated + area + dist_cbd +
## dist_lakes + pool, data = prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -765238 -55999 9774 63848 4954827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.890e+04 3.093e+04 -1.904 0.057170 .
## bed -8.492e+04 1.087e+04 -7.810 1.45e-14 ***
## bath 5.449e+04 1.507e+04 3.616 0.000315 ***
## area_heated 2.439e+02 1.430e+01 17.054 < 2e-16 ***
## area 3.548e+00 5.958e-01 5.955 3.61e-09 ***
## dist_cbd -5.848e+00 1.090e+00 -5.366 1.00e-07 ***
## dist_lakes 9.651e-01 3.810e+00 0.253 0.800091
## pool -2.776e+04 1.640e+04 -1.693 0.090787 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 194900 on 992 degrees of freedom
## Multiple R-squared: 0.547, Adjusted R-squared: 0.5438
## F-statistic: 171.2 on 7 and 992 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model)
##
## Value p-value Decision
## Global Stat 7497980.4 0 Assumptions NOT satisfied!
## Skewness 45659.6 0 Assumptions NOT satisfied!
## Kurtosis 7450926.5 0 Assumptions NOT satisfied!
## Link Function 782.2 0 Assumptions NOT satisfied!
## Heteroscedasticity 612.1 0 Assumptions NOT satisfied!
I do not see explanations about the violations of the assumptions
Based on your findings in the previous section, make changes to the variables, the functional form, etc.
# Bonferroni outlier test on the studentized residuals of the full model.
outliers <- outlierTest(model)
outliers
## rstudent unadjusted p-value Bonferroni p
## 37 57.115829 9.6866e-316 9.6866e-313
## 7 -6.567097 8.2674e-11 8.2674e-08
## 214 4.529055 6.6453e-06 6.6453e-03
# Take the rows to drop from the test result instead of hard-coding
# c(7, 37, 214), so the exclusion stays correct if the data or model change.
# outlierTest() still reports the single largest residual when nothing is
# significant (signif = FALSE); in that case no row is removed.
flagged_rows <- if (isTRUE(outliers$signif)) {
  names(outliers$rstudent)
} else {
  character(0)
}
newdata <- prices[!(rownames(prices) %in% flagged_rows), ]
# Refit the same specification without the flagged observations.
newmodel <- lm(sale_def ~ bed + bath + area_heated + area +
                 dist_cbd + dist_lakes + pool, data = newdata)
This section is missing some sort of transformation of the variables to help address the violated assumptions.
Based on the following inputs, predict the deflated sales price:
# One hypothetical property, with a value for every predictor in the model.
Example1 <- data.frame(
  bed = 2, bath = 2.0, area_heated = 1223, area = 9750,
  dist_cbd = 19368, dist_lakes = 490, pool = 0
)
# Predicted sale price for that property from the refitted model.
predict(newmodel, newdata = Example1)
## 1
## 117543.7